The purpose of this document is to examine codon usage patterns at the genome-wide level and for functional subsets, across four Strongyloides spp. and C. elegans.
For the functional subsets, six groups are analyzed:
1) Top 2% of codon adapted genes in each genome
2) Bottom 2% of codon adapted genes in each genome
3) Top 2% of GC-rich genes in each genome
4) Top 2% of AT-rich genes in each genome
Full lists of CDS sequences for S. stercoralis, S. ratti, S. papillosus, S. venezuelensis and C. elegans were download from Wormbase ParaSite (v15) on November 17, 2020. The .fa files were used as input to the Strongyloides Codon Adapter App. For each species an excel report containing the computed codon adaptation index values relative to S. ratti and C. elegans was downloaded; these file are uploaded the code chunks below.
For each species, load data listing GC content and CAI indeces for the full list of CDS elements in genome. The full list contains transcript level information. If downstream tidy version of this data is avliable or the filtered versions are avaliable as files, don’t regenerate them.
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
temp <- c(Ss = '../Data/Ss_Codon_Usage_Report.xlsx',
Sr = '../Data/Sr_Codon_Usage_Report.xlsx',
Sp = '../Data/Sp_Codon_Usage_Report.xlsx',
Sv = '../Data/Sv_Codon_Usage_Report.xlsx',
Bm = '../Data/Bm_Codon_Usage_Report.xlsx',
Ce = '../Data/Ce_Codon_Usage_Report.xlsx')
dat.genomic <- sapply(temp, read_excel,
sheet = 1,
skip = 3,
col_names = T,
simplify = F,
USE.NAMES = T)
# This data contains transcript level information.
# Commented code would trim rows such that in cases where there are
# multiple transcripts per gene, only
# values from the longest transcript would be included.
# This was originally included to permit comparison to gene expression databases
# but is no longer required.
dat.genomic.cleaned <- lapply(dat.genomic, function(x){
x %>%
dplyr::mutate(transcriptID = geneID)
# dplyr::rename(transcriptID = geneID) %>%
# dplyr::mutate(geneID = str_remove_all(transcriptID, "\\.[0-9]$")) %>%
# dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$")) %>%
# dplyr::group_by(geneID) %>%
# dplyr::slice_max(n = 1, order_by = str_length(`cDNA sequence`),
# with_ties = FALSE) %>%
#dplyr::relocate(geneID, .before = everything())
})
names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Bm", "Ce")
}
Tidy the genomic CAI values for downstream plotting and analysis. This can be computationally intensive, so if the code is run, a tidy version will get saved at the end of the Markdown file as a .csv file. If that file exists, don’t re-run this code.
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
dat.genomic.df <- dat.genomic.cleaned %>%
map("geneID") %>%
unlist() %>%
as_tibble_col(column_name = "geneID") %>%
group_by(geneID)
dat.genomic.df <- dat.genomic.cleaned %>%
map("transcriptID") %>%
unlist() %>%
as_tibble_col(column_name = "transcriptID") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Sr_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Sr_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Bm_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Bm_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Ce_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Ce_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("GC") %>%
unlist() %>%
as_tibble_col(column_name = "GC") %>%
add_column(dat.genomic.df, .)
# Add the column that species which species the gene is from
# by matching to the original lists of genes,
# the group
dat.genomic.df <- dat.genomic.df %>%
dplyr::mutate(species = case_when(
transcriptID %in% dat.genomic$Ss$geneID ~ 'Ss',
transcriptID %in% dat.genomic$Sr$geneID ~ 'Sr',
transcriptID %in% dat.genomic$Sp$geneID ~ 'Sp',
transcriptID %in% dat.genomic$Sv$geneID ~ 'Sv',
transcriptID %in% dat.genomic$Bm$geneID ~ 'Bm',
transcriptID %in% dat.genomic$Ce$geneID ~ 'Ce',
))
dat.genomic.df <- group_by(dat.genomic.df, species)
}
# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
write.table(dat.genomic.df,
file = "../Data/tidy_genomic_CAI.csv",
sep = ",",
col.names = T,
row.names = FALSE)
}
This section takes the calculated codon adaptation index values for all predicted coding sequences, and generates a density curve for each species. Ideally, this plot could be used to benchmark the calculated CAI values against other metrics of codon bias in these species. For example, Mitreva et al 2006 reported effective number of codons (ENC) as a measure of gene-wise codon bias, across species. They report that “mean ENC across all sampled nemtaode species is 46.7 +/- 5.1” but that S. stercoralis and S. ratti are outliers with low ENC values (~35). Note: ENC values range from 20 for highly biased to 61 for no bias. Thus, benchmarking from Mitreva et al 2006, we might expect that Strongyloides species will have greater codon bias than C. elegans. Unfortunately, although they do report measuring distribution of ENC values across all transcripts for each species, they only have plots for S. ratti, P. trichosuri and P. pacificus.
dat.genomic.df <- suppressWarnings(
read.csv("../Data/tidy_genomic_CAI.csv",
header = TRUE)) %>%
as_tibble()
species.specific.dat.df <- dat.genomic.df %>%
dplyr::mutate(species_CAI = case_when(
species == 'Ce' ~ Ce_CAI,
species == 'Bm' ~ Bm_CAI,
TRUE ~ Sr_CAI
)) %>%
dplyr::mutate(outgroup_CAI = case_when(
species == 'Ce' ~ Sr_CAI,
species != 'Ce' ~ Ce_CAI
)) %>%
#dplyr:: select(geneID, species_CAI, species) %>%
group_by(species)
plot_distributions <- function(dat,
x,
title,
xlabel,
caption = "",
ylabel = "Density (scaled to maximum of 1)",
xlim = c(0,1),
ylim = c(0,1)) {
dat %>%
ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
geom_freqpoly(bins = 50) +
scale_color_manual(values = c("seagreen4", "grey4","steelblue4",
"coral4", "darkgoldenrod4",
"darkorchid4"),
name = "Species",
labels = c("B. malayi","C. elegans", "S. papillosus",
"S. ratti", "S. stercoralis",
"S. venezuelensis")) +
labs(title = title,
subtitle = "Data includes all predicted coding sequences",
x = xlabel,
y = ylabel,
caption = caption) +
coord_equal(xlim = xlim, ylim = ylim) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 10, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
species.specific.dat.df$species_CAI,
"Species-wide codon adaptiveness",
"Codon bias relative to \n genus coding usage rules (CAI)",
"Strongyloides species values relative to S. ratti usage rules
C. elegans values relative to C. elegans usage rules.
B. malayi values relative to B. malayi ussage rules.")
species_adaptiveness_plot
Notes re: the above plot - C. elegans shows overall lower codon bias compared to the Strongyloides species, as predicted from the Mitreva et al 2006 results. Reminder that Mitreva et al reported that this difference is greatly affected by the GC content; more AT rich speices like the Strongyloides species have greater codon usage biases. Also, it looks like the Strongyloides subclades (S. ratti - S. stercoralis; S. papillosus - S. venezuelensis) cluster together.
# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)
# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 67968.8801, df = 5, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | Bm Ce Sp Sr Ss
## ---------+-------------------------------------------------------
## Ce | 239.1902
## | 0.0000*
## |
## Sp | 140.9085 -90.93966
## | 0.0000* 0.0000*
## |
## Sr | 59.54179 -156.7374 -71.05939
## | 0.0000* 0.0000* 0.0000*
## |
## Ss | 75.88232 -141.8075 -55.85025 14.81922
## | 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Sv | 137.4337 -89.24383 -0.668115 69.17591 54.20444
## | 0.0000* 0.0000* 1.0000 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
This section takes the calculated GC content values for all predicted coding sequences, and generates a density curve for each species.
species.specific.GC.df <- dat.genomic.df %>%
dplyr:: select(geneID, GC, species) %>%
group_by(species)
species_GC_plot <- plot_distributions(species.specific.GC.df,
species.specific.GC.df$GC,
"Species-wide GC content",
"GC content")
species_GC_plot
# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)
# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: x and group
## Kruskal-Wallis chi-squared = 67947.0291, df = 5, p-value = 0
##
##
## Comparison of x by group
## (Bonferroni)
## Col Mean-|
## Row Mean | Bm Ce Sp Sr Ss
## ---------+-------------------------------------------------------
## Ce | -47.12620
## | 0.0000*
## |
## Sp | 92.70032 157.3087
## | 0.0000* 0.0000*
## |
## Sr | 130.6162 190.8436 48.50346
## | 0.0000* 0.0000* 0.0000*
## |
## Ss | 125.6135 186.4502 42.18947 -6.400016
## | 0.0000* 0.0000* 0.0000* 0.0000*
## |
## Sv | 98.32093 161.7153 7.855613 -40.54231 -34.22333
## | 0.0000* 0.0000* 0.0000* 0.0000* 0.0000*
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
Take genome-wide CAI values, and for each species filter the top 2% and also bottom 2% of codon adapted genes. Will run GO analysis on these to determine whether there are enriched GO Terms.
percentile = 0.02
# For following analyses, only look at Strongyloides and C. elegans, not B. malayi
species.specific.dat.df<-species.specific.dat.df %>%
dplyr::filter(species != "Bm")
dat.Top <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Top")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = species_CAI, prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Top)
dat.Bot <- species.specific.dat.df %>%
# dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Bot")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = desc(species_CAI), prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Bot)
This section/plot asks questions like: Are the most-codon-adapted Strongyloides genes less codon adapted relative to the C. elegans usage rules? Similarly, are the least-codon-adapted genes less codon adapted relative to the other usage rule. Also, is there a consistent relationship between the degree of codon adaptation and GC content (a common confounding factor).
tbl <- rbind(dat.Top, dat.Bot) %>%
mutate(Description = recode(Description, Top = "Most-adapted genes",
Bot = "Least-adapted genes"))
ggplot(tbl, aes(species_CAI,GC, Description)) +
geom_point(data = species.specific.dat.df,
mapping = aes(x = species_CAI, y = GC),color = "grey", shape = 1) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_fixed (3) +
facet_grid( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (genus rules)",
subtitle = "grey icons = all genes \n colored icons = most and least adapted genes ") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
# tbl <- pivot_longer(tbl,
# cols = c("species_CAI", "outgroup_CAI"),
# names_to = "value",
# values_to = "CAI"
# )
ggplot(tbl, aes(outgroup_CAI,GC, Description)) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_equal()+
facet_wrap( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to non-genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (non-genus rules)") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8))
Do genes with the top and bottom 2% of CAI values show particularly high or low expression levels? We might expect that genes with the top 2% of CAI values, where the CAI is based on highly expressed S. ratti ESTs might tend to show higher expression levels.
temp <- c(Ss = '../Data/SsRNAseq_log2cpm_filtered_norm_voom.csv',
Sr = '../Data/SrRNAseq_log2cpm_filtered_norm_voom.csv',
Sp = '../Data/SpRNAseq_log2cpm_filtered_norm_voom.csv',
Sv = '../Data/SvRNAseq_log2cpm_filtered_norm_voom.csv')
dat.expression <- sapply(temp, read.csv,
header = TRUE)
species_names <- names(dat.expression)
dat.Top.expression <- lapply(species_names, function(x) {
dat.Top.select <- dplyr::filter(dat.Top, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Top.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Top.expression) <- species_names
dat.Top.expression <- map_dfr(dat.Top.expression, bind_rows, .id = "species")
dat.Bot.expression <- lapply(species_names, function(x) {
dat.Bot.select <- dplyr::filter(dat.Bot, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Bot.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Bot.expression) <- species_names
dat.Bot.expression <- map_dfr(dat.Bot.expression, bind_rows, .id = "species")
dat.expression <- lapply(species_names, function(x) {
dat.expression[[x]] %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.expression) <- species_names
dat.expression <- map_dfr(dat.expression, bind_rows, .id = "species")
lapply(species_names, function(x){
plotdata.subset1 <- filter(dat.Top.expression, species == x)
plotdata.subset2 <- filter(dat.Bot.expression, species == x)
plotdata.all <- filter(dat.expression, species == x)
plot.df <- bind_rows(HighCAI = plotdata.subset1,
LowCAI = plotdata.subset2,
AllGenes = plotdata.all, .id = "Description")
p.Bot.expression <- ggplot(plot.df) +
aes(x=stage, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
alpha = .5,
position = position_dodge(width = 1),
trim = FALSE) +
scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
show.legend = FALSE) +
facet_grid(~stage, scales = "free") +
labs(y="log2CPM Expression", x = "Life Stage",
title = paste0(x, " - log2 Counts per Million (CPM)"),
caption=paste0("produced on ", Sys.time())) +
theme_bw()
suppressMessages(ggsave(paste0(x,"_CAIExpression.pdf"),
plot = p.Bot.expression,
device = "pdf",
height = 4,
width = 7,
path = output.path,
useDingbats=FALSE))
p.Bot.expression
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
Does CAI value correspond with expression in free-living females
temp <- lapply(species_names, function(x) {
dat.select <- dplyr::filter(species.specific.dat.df,
species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
dplyr::ungroup() %>%
dplyr::select(geneID, species_CAI)
subset.expression <- dat.expression %>%
dplyr::filter(species == x)%>%
dplyr::filter(stage == "FLF") %>%
dplyr::filter(geneID %in% dat.select$geneID)
full_join(dat.select, subset.expression, by = "geneID") %>%
dplyr::select(!stage)
})
names(temp) <- species_names
caiExp.df <- map_dfr(temp, bind_rows, .id = "species")
caiExp.df <- bind_rows(AllGenes = caiExp.df,
HighCAI = dplyr::filter(caiExp.df, geneID %in% dat.Top.expression$geneID),
LowCAI = dplyr::filter(caiExp.df, geneID %in% dat.Bot.expression$geneID),
.id = "Description") %>%
dplyr::mutate(species = as_factor(species))
caiExp.highExp_plot <- ggplot(caiExp.df) +
aes(x=Description, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
position = position_dodge(width = 1),
trim = FALSE,
color = 'black') +
#scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
color = "black",
show.legend = FALSE) +
facet_grid( ~species) +
labs(y="log2CPM Expression", x = "Gene subsets (base on CAI value) by species",
title = "log2 Counts per Million (CPM) in free-living female as a function of codon bias",
subtitle = "grey = all genes \ncolors = top/bottom 2% CAI",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
caiExp.highExp_plot
suppressPackageStartupMessages({library(car)})
cpm.merged <- rbind(dat.expression %>% dplyr::mutate(condition = "AllGenes"),
dat.Top.expression %>% dplyr::mutate(condition = "HighCPM"),
dat.Bot.expression %>% dplyr::mutate(condition = "LowCPM")) %>%
dplyr::mutate(condition = as_factor(condition))%>%
group_by(species, geneID, condition, stage)
options(contrasts = c("contr.sum", "contr.poly"))
lapply(species_names, function(x) {
cpm.merged.sub <- filter(cpm.merged, species == x)
res.aov2<- aov(expression ~ condition * stage, data = cpm.merged.sub)
Anova(res.aov2, type = "III")
res.aov3<-TukeyHSD(res.aov2, "condition:stage")
as_tibble(res.aov3$`condition:stage`, rownames = "comparison") %>%
separate(col = comparison, into = c("comp1", "LS1", "comp2", "LS2")) %>%
dplyr::filter(LS1 == LS2) %>%
unite(temp1, comp1, LS1, sep = ":")%>%
unite(temp2, comp2, LS2, sep = ":")%>%
unite(comparison, temp1, temp2, sep = "-")
})
## [[1]]
## # A tibble: 21 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 1.79 0.960 2.63 3.50e-12
## 2 LowCPM:FLF-AllGenes:FLF -1.45 -2.41 -0.497 1.20e- 5
## 3 LowCPM:FLF-HighCPM:FLF -3.25 -4.50 -1.99 2.38e-13
## 4 HighCPM:iL3-AllGenes:iL3 0.665 -0.168 1.50 3.43e- 1
## 5 LowCPM:iL3-AllGenes:iL3 -2.05 -3.00 -1.09 4.68e-12
## 6 LowCPM:iL3-HighCPM:iL3 -2.71 -3.97 -1.45 3.04e-12
## 7 HighCPM:iL3a-AllGenes:iL3a 1.10 0.266 1.93 4.83e- 4
## 8 LowCPM:iL3a-AllGenes:iL3a -1.76 -2.71 -0.801 1.14e- 8
## 9 LowCPM:iL3a-HighCPM:iL3a -2.86 -4.11 -1.60 3.64e-13
## 10 HighCPM:PF-AllGenes:PF 0.943 0.109 1.78 9.21e- 3
## # … with 11 more rows
##
## [[2]]
## # A tibble: 12 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 1.57 0.785 2.35 3.69e- 9
## 2 LowCPM:FLF-AllGenes:FLF -4.00 -4.93 -3.08 0.
## 3 LowCPM:FLF-HighCPM:FLF -5.57 -6.77 -4.37 0.
## 4 HighCPM:FLM-AllGenes:FLM 3.22 2.44 4.00 0.
## 5 LowCPM:FLM-AllGenes:FLM -2.72 -3.65 -1.80 8.69e-14
## 6 LowCPM:FLM-HighCPM:FLM -5.94 -7.14 -4.74 0.
## 7 HighCPM:iL3-AllGenes:iL3 0.607 -0.174 1.39 3.15e- 1
## 8 LowCPM:iL3-AllGenes:iL3 -2.77 -3.70 -1.85 1.23e-13
## 9 LowCPM:iL3-HighCPM:iL3 -3.38 -4.58 -2.18 1.36e-13
## 10 HighCPM:PF-AllGenes:PF 0.175 -0.605 0.956 1.00e+ 0
## 11 LowCPM:PF-AllGenes:PF -3.93 -4.85 -3.01 0.
## 12 LowCPM:PF-HighCPM:PF -4.11 -5.31 -2.91 2.93e-14
##
## [[3]]
## # A tibble: 18 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 2.60 1.86 3.33 0.
## 2 LowCPM:FLF-AllGenes:FLF -0.806 -1.66 0.0522 9.58e- 2
## 3 LowCPM:FLF-HighCPM:FLF -3.40 -4.52 -2.28 1.69e-13
## 4 HighCPM:flL1L2-AllGenes:flL1L2 3.67 2.94 4.41 0.
## 5 LowCPM:flL1L2-AllGenes:flL1L2 -1.76 -2.62 -0.901 1.31e-10
## 6 LowCPM:flL1L2-HighCPM:flL1L2 -5.43 -6.55 -4.31 0.
## 7 HighCPM:FLM-AllGenes:FLM 2.60 1.86 3.33 0.
## 8 LowCPM:FLM-AllGenes:FLM -1.50 -2.36 -0.642 1.60e- 7
## 9 LowCPM:FLM-HighCPM:FLM -4.10 -5.22 -2.98 0.
## 10 HighCPM:iL3-AllGenes:iL3 1.04 0.307 1.78 1.11e- 4
## 11 LowCPM:iL3-AllGenes:iL3 -1.53 -2.39 -0.669 8.02e- 8
## 12 LowCPM:iL3-HighCPM:iL3 -2.57 -3.69 -1.45 4.28e-13
## 13 HighCPM:PF-AllGenes:PF 1.92 1.18 2.65 2.29e-13
## 14 LowCPM:PF-AllGenes:PF -0.938 -1.80 -0.0804 1.61e- 2
## 15 LowCPM:PF-HighCPM:PF -2.85 -3.97 -1.73 1.29e-13
## 16 HighCPM:pL1L2-AllGenes:pL1L2 3.65 2.91 4.38 0.
## 17 LowCPM:pL1L2-AllGenes:pL1L2 -1.75 -2.61 -0.897 1.48e-10
## 18 LowCPM:pL1L2-HighCPM:pL1L2 -5.40 -6.52 -4.28 0.
##
## [[4]]
## # A tibble: 6 x 5
## comparison diff lwr upr `p adj`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HighCPM:FLF-AllGenes:FLF 1.12 0.544 1.70 5.00e- 7
## 2 LowCPM:FLF-AllGenes:FLF -1.55 -2.32 -0.789 1.05e- 7
## 3 LowCPM:FLF-HighCPM:FLF -2.68 -3.63 -1.72 9.73e-14
## 4 HighCPM:PF-AllGenes:PF 0.493 -0.0869 1.07 1.48e- 1
## 5 LowCPM:PF-AllGenes:PF -1.61 -2.37 -0.846 2.91e- 8
## 6 LowCPM:PF-HighCPM:PF -2.10 -3.06 -1.15 4.66e- 9
Take lists of the top percentile of codon adapted genes in each species, and run GO analysis to see if these genes are enriched for anything interesting.
# Carry out GO enrichment using gProfiler2 ----
run_GO <- function (data){
GO.geneID <- data %>%
ungroup() %>%
group_by(species) %>%
dplyr::group_map(~ {
gost.res <- .x %>%
dplyr::select(geneID, GC) %>%
unique()
}, .keep = TRUE)
names(GO.geneID) <- levels(data$species)
gost.results <- lapply(names(GO.geneID), function(y){
organism = case_when (y == "Ss" ~ "ststerprjeb528",
y == "Sr" ~ "strattprjeb125",
y == "Sp" ~ "stpapiprjeb525",
y == "Sv" ~ "stveneprjeb530",
y == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID[[y]]$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results) <- names(GO.geneID)
gost.results
}
find_enrichments <- function (gost.results) {
# Find GO terms that are enriched with p-values equal or small than a threshold.
enriched.terms <- lapply(names(gost.results), function(y){
gost.results[[y]]$result %>%
as_tibble() %>%
dplyr::select(term_id, p_value)
})
names(enriched.terms) <- names(gost.results)
enriched.terms
}
find_common_enrichments <- function (data, p.val,n_groups){
data %>%
dplyr::filter(p_value <= p.val) %>%
group_by(term_id) %>%
summarize(n= n()) %>%
dplyr::filter(n >= n_groups)
}
gost.results.top <- run_GO(dat.Top)
enriched.terms.top <- find_enrichments(gost.results.top)
highTerms.overlap.top <- enriched.terms.top %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.top$Ss, interactive = T, capped = T)
gostplot(gost.results.top$Sr, interactive = T, capped = T)
gostplot(gost.results.top$Sp, interactive = T, capped = T)
gostplot(gost.results.top$Sv, interactive = T, capped = T)
gostplot(gost.results.top$Ce, interactive = T, capped = T)
GO terms that are commonly enriched in the top percentile of codon adapted genes in all Strongyloides species tested:
dplyr::filter(gost.results.top$Ss$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many of the common Strongyloides go terms are found in the C. elegans terms?
dplyr::filter(gost.results.top$Ce$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name)
How many are unique to the Strongyloides species?
temp <- dplyr::filter(highTerms.overlap.top,
!term_id %in% gost.results.top$Ce$result$term_id)
dplyr::filter(gost.results.top$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Take lists of the bottom percentile of codon adapted genes in each species, and run GO analysis to see if these genes are enriched for anything interesting.
# Carry out GO enrichment using gProfiler2 ----
gost.results.bot <- run_GO(dat.Bot)
enriched.terms.bot <- find_enrichments(gost.results.bot)
highTerms.overlap.bot <- enriched.terms.bot %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.bot$Ss, interactive = T, capped = T)
gostplot(gost.results.bot$Sr, interactive = T, capped = T)
gostplot(gost.results.bot$Sp, interactive = T, capped = T)
gostplot(gost.results.bot$Sv, interactive = T, capped = T)
gostplot(gost.results.bot$Ce, interactive = T, capped = T)
GO terms that are commonly enriched in the bottom percentile of codon adapted genes in all Strongyloides species tested:
dplyr::filter(gost.results.bot$Ss$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
How many of the common Strongyloides go terms are found in the C. elegans terms?
dplyr::filter(gost.results.bot$Ce$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
How many are unique to the Strongyloides species?
temp <- dplyr::filter(highTerms.overlap.bot,
!term_id %in% gost.results.bot$Ce$result$term_id)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
dat.ATrich <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Top")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = desc(GC), prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
How many genes in the AT-rich subset are also included in the subset of genes highly adapted for Strongyloides expression?
common.sequences <- inner_join(dat.ATrich, dat.Top) %>%
tally()
tally(dat.Top) %>%
dplyr::filter(species != "Ce") %>%
dplyr::mutate(ratio_shared_seq = common.sequences$n/n)
# Go analysis
gost.results.ATrich <- run_GO(dat.ATrich)
enriched.terms.ATrich <- find_enrichments(gost.results.ATrich)
highTerms.overlap.ATrich <- enriched.terms.ATrich %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
What GO terms are enriched in AT-rich genes?
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many GO terms are enriched in both AT-rich sequences and most Str-codon-adapted sequences?
#
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
How many GO terms are enriched in both AT-rich sequences and least Str-codon-adapted sequences?
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
dat.GCrich <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Bot")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = GC, prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
How many genes in the GC-rich subset are also included in the subset of genes poorly adapted for Strongyloides expression?
common.sequences <- inner_join(dat.GCrich, dat.Bot) %>%
dplyr::filter(species != "Ce") %>%
tally()
tally(dat.Bot) %>%
dplyr::filter(species != "Ce") %>%
dplyr::mutate(ratio_shared_seq = common.sequences$n/n)
# Go analysis
gost.results.GCrich <- run_GO(dat.GCrich)
enriched.terms.GCrich <- find_enrichments(gost.results.GCrich)
highTerms.overlap.GCrich <- enriched.terms.GCrich %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
What GO terms are enriched in GC-rich genes?
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many GO terms are enriched in both GC-rich sequences and least Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in GC-rich sequences and least Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
How many GO terms are enriched in both GC-rich sequences and highest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in GC-rich sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Generate lists of genes with the highest expression in free-living females in each species, and run GO analysis to see if these genes are enriched for anything interesting. Here, common GO terms are defined as terms that appear in 3 or 4 species, because the results from S. papillosus are divergent from the others.
gost.results.highexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = expression, prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.highexp) <- species_names
enriched.terms.highexp <- find_enrichments(gost.results.highexp)
highTerms.overlap.highexp <- enriched.terms.highexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
What GO terms are commonly enriched in highly expressed genes in 3 of 4 species?
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% highTerms.overlap.highexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many GO terms are enriched in both highly expressed sequences and highest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in highly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
How many GO terms are enriched in both highly expressed sequences and poorest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in highly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Generate lists of genes with lowest expression in free-living females in each species, and run GO analysis to see if these genes are enriched for anything interesting. Here, common GO terms are defined as terms that appear in 3 or 4 species, because the results from S. papillosus are divergent from the others.
gost.results.lowexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = desc(expression), prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.lowexp) <- species_names
enriched.terms.lowexp <- find_enrichments(gost.results.lowexp)
highTerms.overlap.lowexp <- enriched.terms.lowexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
What GO terms are commonly enriched in poorly expressed genes in 3 of 4 species?
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% highTerms.overlap.lowexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
How many GO terms are enriched in both poorly expressed sequences and highest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in poorly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
How many GO terms are enriched in both poorly expressed sequences and poorest Str-codon-adapted sequences?
#Overlap btwn GO terms enriched in poorly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
Saving data and plots generated by the above code.
# Save distribution plot of codon adaptivness by species
ggsave('../Outputs/Codon_Adaptiveness_Distributions_by_Species.pdf',
plot = species_adaptiveness_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save distribution plot of GC content by species
ggsave('../Outputs/GC_Distributions_by_Species.pdf',
plot = species_GC_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save plot of CAI vs FLF expression level
ggsave('../Outputs/CAIvsFLFlog2CPM_by_Species.pdf',
plot = caiExp.highExp_plot,
width = 5.5, height = 3,
units = "in", device = "pdf",
useDingbats = FALSE)
write_excel <- function(data, sheet_facet, filename){
file <- paste(filename,".xlsx",sep = "")
# Workbook
to_download <<- createWorkbook()
sapply(seq_along(sheet_facet), function(x){
addWorksheet(wb = to_download, sheetName = sheet_facet[[x]])
sheet_data <- data %>%
dplyr::filter(species == sheet_facet[[x]]) %>%
dplyr::select(!species)
# # Write Data
# ## Sheet header
# writeData(
# to_download,
# sheet = 1,
# x = c(
# paste0("Strongyloides Codon Usage Report"),
# paste0("Report generated on ", format(Sys.Date(), "%B %d, %Y"))
# )
# )
#
## Results of codon usage analysis
writeData(
to_download,
sheet = x,
x = sheet_data,
startRow = 1,
startCol = 1,
headerStyle = createStyle(
textDecoration = "Bold",
halign = "center",
border = "bottom"
)
)
})
saveWorkbook(to_download, file, overwrite=TRUE)
}
# Save files with top and bottom percentiles of codon adapted genes per species
tbl %>%
dplyr::select(geneID, species_CAI, GC, Description, species) %>%
write_excel(levels(tbl$species), '../Outputs/Percentile_CAI_Genes')
# Save list of all GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes
enriched.GO.terms <- rbind(
lowCAI = lapply(names(gost.results.bot), function(y){
gost.results.bot[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Least-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highCAI = lapply(names(gost.results.top), function(y){
gost.results.top[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Most-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything())
)
write_excel(enriched.GO.terms, levels(enriched.GO.terms$species), '../Outputs/Percentile_GO_Analysis')
# Save list of common GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes and GT-rich and AT-rich subsets too
common.terms <- rbind (lowCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::filter(species == "Ss") %>%
dplyr::select(term_id, source,term_name, Description),
highCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::filter(species == "Ss") %>%
dplyr::select(term_id, source, term_name, Description),
ATrich = dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
dplyr::select(term_id, source, term_name) %>%
dplyr::mutate(Description = "AT-rich genes"),
GCrich = dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
dplyr::select(term_id, source, term_name)%>%
dplyr::mutate(Description = "GC-rich genes")
) %>%
dplyr::mutate(Description = factor(Description)) %>%
dplyr::rename(species = Description)
write_excel(common.terms, levels(common.terms$species), '../Outputs/Commonly_Enriched_GO_Terms')
# Save Functional Enrichment Plots for bottom percentile
lapply(names(gost.results.bot), function(y){
gost.res <- gost.results.bot[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Bottom", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.bot$term_id)
ggsave(paste0('../Outputs/',y,'_Bot_CAI_Plot.pdf'), plot = gost.ggplot.pub,
width = 7, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
# Save Functional Enrichment Plots for top percentile
lapply(names(gost.results.top), function(y){
gost.res <- gost.results.top[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Top", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.top$term_id)
ggsave(paste0('../Outputs/',y,'_Top_CAI_Plot.pdf'), plot = gost.ggplot.pub,
width = 7, height = 5,
units = "in", device = "pdf",
useDingbats = FALSE)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
suppressPackageStartupMessages({
library(knitr)
library(rmarkdown)
library(tidyverse)
library(readxl)
library(magrittr)
library(DescTools)
library(ggthemes)
library(biomaRt)
library(ggplot2)
library(openxlsx)
library(wrapr)
library(Matching)
library(cowplot)
library(gprofiler2)
source('fetch_cDNAseq.R')
})
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Check for presence of output plots folder, generate if it doesn't exist
output.path <- "../Outputs"
if (!dir.exists(output.path)){
dir.create(output.path)
}
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
temp <- c(Ss = '../Data/Ss_Codon_Usage_Report.xlsx',
Sr = '../Data/Sr_Codon_Usage_Report.xlsx',
Sp = '../Data/Sp_Codon_Usage_Report.xlsx',
Sv = '../Data/Sv_Codon_Usage_Report.xlsx',
Bm = '../Data/Bm_Codon_Usage_Report.xlsx',
Ce = '../Data/Ce_Codon_Usage_Report.xlsx')
dat.genomic <- sapply(temp, read_excel,
sheet = 1,
skip = 3,
col_names = T,
simplify = F,
USE.NAMES = T)
# This data contains transcript level information.
# Commented code would trim rows such that in cases where there are
# multiple transcripts per gene, only
# values from the longest transcript would be included.
# This was originally included to permit comparison to gene expression databases
# but is no longer required.
dat.genomic.cleaned <- lapply(dat.genomic, function(x){
x %>%
dplyr::mutate(transcriptID = geneID)
# dplyr::rename(transcriptID = geneID) %>%
# dplyr::mutate(geneID = str_remove_all(transcriptID, "\\.[0-9]$")) %>%
# dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$")) %>%
# dplyr::group_by(geneID) %>%
# dplyr::slice_max(n = 1, order_by = str_length(`cDNA sequence`),
# with_ties = FALSE) %>%
#dplyr::relocate(geneID, .before = everything())
})
names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Bm", "Ce")
}
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
dat.genomic.df <- dat.genomic.cleaned %>%
map("geneID") %>%
unlist() %>%
as_tibble_col(column_name = "geneID") %>%
group_by(geneID)
dat.genomic.df <- dat.genomic.cleaned %>%
map("transcriptID") %>%
unlist() %>%
as_tibble_col(column_name = "transcriptID") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Sr_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Sr_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Bm_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Bm_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("Ce_CAI") %>%
unlist() %>%
as_tibble_col(column_name = "Ce_CAI") %>%
add_column(dat.genomic.df, .)
dat.genomic.df <- dat.genomic.cleaned %>%
map("GC") %>%
unlist() %>%
as_tibble_col(column_name = "GC") %>%
add_column(dat.genomic.df, .)
# Add the column that species which species the gene is from
# by matching to the original lists of genes,
# the group
dat.genomic.df <- dat.genomic.df %>%
dplyr::mutate(species = case_when(
transcriptID %in% dat.genomic$Ss$geneID ~ 'Ss',
transcriptID %in% dat.genomic$Sr$geneID ~ 'Sr',
transcriptID %in% dat.genomic$Sp$geneID ~ 'Sp',
transcriptID %in% dat.genomic$Sv$geneID ~ 'Sv',
transcriptID %in% dat.genomic$Bm$geneID ~ 'Bm',
transcriptID %in% dat.genomic$Ce$geneID ~ 'Ce',
))
dat.genomic.df <- group_by(dat.genomic.df, species)
}
# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
write.table(dat.genomic.df,
file = "../Data/tidy_genomic_CAI.csv",
sep = ",",
col.names = T,
row.names = FALSE)
}
dat.genomic.df <- suppressWarnings(
read.csv("../Data/tidy_genomic_CAI.csv",
header = TRUE)) %>%
as_tibble()
species.specific.dat.df <- dat.genomic.df %>%
dplyr::mutate(species_CAI = case_when(
species == 'Ce' ~ Ce_CAI,
species == 'Bm' ~ Bm_CAI,
TRUE ~ Sr_CAI
)) %>%
dplyr::mutate(outgroup_CAI = case_when(
species == 'Ce' ~ Sr_CAI,
species != 'Ce' ~ Ce_CAI
)) %>%
#dplyr:: select(geneID, species_CAI, species) %>%
group_by(species)
plot_distributions <- function(dat,
x,
title,
xlabel,
caption = "",
ylabel = "Density (scaled to maximum of 1)",
xlim = c(0,1),
ylim = c(0,1)) {
dat %>%
ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
geom_freqpoly(bins = 50) +
scale_color_manual(values = c("seagreen4", "grey4","steelblue4",
"coral4", "darkgoldenrod4",
"darkorchid4"),
name = "Species",
labels = c("B. malayi","C. elegans", "S. papillosus",
"S. ratti", "S. stercoralis",
"S. venezuelensis")) +
labs(title = title,
subtitle = "Data includes all predicted coding sequences",
x = xlabel,
y = ylabel,
caption = caption) +
coord_equal(xlim = xlim, ylim = ylim) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 10, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
species.specific.dat.df$species_CAI,
"Species-wide codon adaptiveness",
"Codon bias relative to \n genus coding usage rules (CAI)",
"Strongyloides species values relative to S. ratti usage rules
C. elegans values relative to C. elegans usage rules.
B. malayi values relative to B. malayi ussage rules.")
species_adaptiveness_plot
# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)
# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")
species.specific.GC.df <- dat.genomic.df %>%
dplyr:: select(geneID, GC, species) %>%
group_by(species)
species_GC_plot <- plot_distributions(species.specific.GC.df,
species.specific.GC.df$GC,
"Species-wide GC content",
"GC content")
species_GC_plot
# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)
# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")
percentile = 0.02
# For following analyses, only look at Strongyloides and C. elegans, not B. malayi
species.specific.dat.df<-species.specific.dat.df %>%
dplyr::filter(species != "Bm")
dat.Top <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Top")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = species_CAI, prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Top)
dat.Bot <- species.specific.dat.df %>%
# dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Bot")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = desc(species_CAI), prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
tally(dat.Bot)
tbl <- rbind(dat.Top, dat.Bot) %>%
mutate(Description = recode(Description, Top = "Most-adapted genes",
Bot = "Least-adapted genes"))
ggplot(tbl, aes(species_CAI,GC, Description)) +
geom_point(data = species.specific.dat.df,
mapping = aes(x = species_CAI, y = GC),color = "grey", shape = 1) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_fixed (3) +
facet_grid( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (genus rules)",
subtitle = "grey icons = all genes \n colored icons = most and least adapted genes ") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
# tbl <- pivot_longer(tbl,
# cols = c("species_CAI", "outgroup_CAI"),
# names_to = "value",
# values_to = "CAI"
# )
ggplot(tbl, aes(outgroup_CAI,GC, Description)) +
geom_point(mapping = aes(color = species, shape = Description)) +
#geom_jitter(mapping = aes(color = species), shape = 1) +
coord_equal()+
facet_wrap( ~species) +
scale_color_hue (l = 40) +
scale_shape_manual(values = c(1:2)) +
labs(x = "Codon bias relative to non-genus usage rules",
y = "G+C ratio",
title = "G+C ratio as a function of codon adaptiveness (non-genus rules)") +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8))
temp <- c(Ss = '../Data/SsRNAseq_log2cpm_filtered_norm_voom.csv',
Sr = '../Data/SrRNAseq_log2cpm_filtered_norm_voom.csv',
Sp = '../Data/SpRNAseq_log2cpm_filtered_norm_voom.csv',
Sv = '../Data/SvRNAseq_log2cpm_filtered_norm_voom.csv')
dat.expression <- sapply(temp, read.csv,
header = TRUE)
species_names <- names(dat.expression)
dat.Top.expression <- lapply(species_names, function(x) {
dat.Top.select <- dplyr::filter(dat.Top, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Top.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Top.expression) <- species_names
dat.Top.expression <- map_dfr(dat.Top.expression, bind_rows, .id = "species")
dat.Bot.expression <- lapply(species_names, function(x) {
dat.Bot.select <- dplyr::filter(dat.Bot, species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$"))
subset.expression <- dat.expression[[x]] %>%
dplyr::filter(X %in% dat.Bot.select$geneID) %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.Bot.expression) <- species_names
dat.Bot.expression <- map_dfr(dat.Bot.expression, bind_rows, .id = "species")
dat.expression <- lapply(species_names, function(x) {
dat.expression[[x]] %>%
dplyr::rename(geneID = X) %>%
pivot_longer(cols = -geneID,
names_to = "samples",
values_to = "expression") %>%
tidyr::separate(samples,c("stage", NA)) %>%
dplyr::group_by(geneID, stage) %>%
dplyr::summarize(expression = median(expression))
})
names(dat.expression) <- species_names
dat.expression <- map_dfr(dat.expression, bind_rows, .id = "species")
lapply(species_names, function(x){
plotdata.subset1 <- filter(dat.Top.expression, species == x)
plotdata.subset2 <- filter(dat.Bot.expression, species == x)
plotdata.all <- filter(dat.expression, species == x)
plot.df <- bind_rows(HighCAI = plotdata.subset1,
LowCAI = plotdata.subset2,
AllGenes = plotdata.all, .id = "Description")
p.Bot.expression <- ggplot(plot.df) +
aes(x=stage, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
alpha = .5,
position = position_dodge(width = 1),
trim = FALSE) +
scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
show.legend = FALSE) +
facet_grid(~stage, scales = "free") +
labs(y="log2CPM Expression", x = "Life Stage",
title = paste0(x, " - log2 Counts per Million (CPM)"),
caption=paste0("produced on ", Sys.time())) +
theme_bw()
suppressMessages(ggsave(paste0(x,"_CAIExpression.pdf"),
plot = p.Bot.expression,
device = "pdf",
height = 4,
width = 7,
path = output.path,
useDingbats=FALSE))
p.Bot.expression
})
temp <- lapply(species_names, function(x) {
dat.select <- dplyr::filter(species.specific.dat.df,
species == x) %>%
dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
dplyr::ungroup() %>%
dplyr::select(geneID, species_CAI)
subset.expression <- dat.expression %>%
dplyr::filter(species == x)%>%
dplyr::filter(stage == "FLF") %>%
dplyr::filter(geneID %in% dat.select$geneID)
full_join(dat.select, subset.expression, by = "geneID") %>%
dplyr::select(!stage)
})
names(temp) <- species_names
caiExp.df <- map_dfr(temp, bind_rows, .id = "species")
caiExp.df <- bind_rows(AllGenes = caiExp.df,
HighCAI = dplyr::filter(caiExp.df, geneID %in% dat.Top.expression$geneID),
LowCAI = dplyr::filter(caiExp.df, geneID %in% dat.Bot.expression$geneID),
.id = "Description") %>%
dplyr::mutate(species = as_factor(species))
caiExp.highExp_plot <- ggplot(caiExp.df) +
aes(x=Description, y=expression, color = Description, fill = Description) +
geom_violin(show.legend = T,
position = position_dodge(width = 1),
trim = FALSE,
color = 'black') +
#scale_color_manual(values = c("black", "coral4", "steelblue4")) +
scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
stat_summary(fun = "median",
geom = "point",
shape = 20,
size = 2,
position = position_dodge(width = 1),
color = "black",
show.legend = FALSE) +
facet_grid( ~species) +
labs(y="log2CPM Expression", x = "Gene subsets (base on CAI value) by species",
title = "log2 Counts per Million (CPM) in free-living female as a function of codon bias",
subtitle = "grey = all genes \ncolors = top/bottom 2% CAI",
caption=paste0("produced on ", Sys.time())) +
theme_bw() +
theme(plot.title.position = "plot",
plot.caption.position = "panel",
plot.title = element_text(face = "bold",
size = 8, hjust = 0),
plot.subtitle = element_text(size = 8),
legend.title = element_text(size = 8),
axis.title = element_text(size = 8),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1))
caiExp.highExp_plot
suppressPackageStartupMessages({library(car)})
cpm.merged <- rbind(dat.expression %>% dplyr::mutate(condition = "AllGenes"),
dat.Top.expression %>% dplyr::mutate(condition = "HighCPM"),
dat.Bot.expression %>% dplyr::mutate(condition = "LowCPM")) %>%
dplyr::mutate(condition = as_factor(condition))%>%
group_by(species, geneID, condition, stage)
options(contrasts = c("contr.sum", "contr.poly"))
lapply(species_names, function(x) {
cpm.merged.sub <- filter(cpm.merged, species == x)
res.aov2<- aov(expression ~ condition * stage, data = cpm.merged.sub)
Anova(res.aov2, type = "III")
res.aov3<-TukeyHSD(res.aov2, "condition:stage")
as_tibble(res.aov3$`condition:stage`, rownames = "comparison") %>%
separate(col = comparison, into = c("comp1", "LS1", "comp2", "LS2")) %>%
dplyr::filter(LS1 == LS2) %>%
unite(temp1, comp1, LS1, sep = ":")%>%
unite(temp2, comp2, LS2, sep = ":")%>%
unite(comparison, temp1, temp2, sep = "-")
})
# Carry out GO enrichment using gProfiler2 ----
run_GO <- function (data){
GO.geneID <- data %>%
ungroup() %>%
group_by(species) %>%
dplyr::group_map(~ {
gost.res <- .x %>%
dplyr::select(geneID, GC) %>%
unique()
}, .keep = TRUE)
names(GO.geneID) <- levels(data$species)
gost.results <- lapply(names(GO.geneID), function(y){
organism = case_when (y == "Ss" ~ "ststerprjeb528",
y == "Sr" ~ "strattprjeb125",
y == "Sp" ~ "stpapiprjeb525",
y == "Sv" ~ "stveneprjeb530",
y == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID[[y]]$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results) <- names(GO.geneID)
gost.results
}
find_enrichments <- function (gost.results) {
# Find GO terms that are enriched with p-values equal or small than a threshold.
enriched.terms <- lapply(names(gost.results), function(y){
gost.results[[y]]$result %>%
as_tibble() %>%
dplyr::select(term_id, p_value)
})
names(enriched.terms) <- names(gost.results)
enriched.terms
}
find_common_enrichments <- function (data, p.val,n_groups){
data %>%
dplyr::filter(p_value <= p.val) %>%
group_by(term_id) %>%
summarize(n= n()) %>%
dplyr::filter(n >= n_groups)
}
gost.results.top <- run_GO(dat.Top)
enriched.terms.top <- find_enrichments(gost.results.top)
highTerms.overlap.top <- enriched.terms.top %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.top$Ss, interactive = T, capped = T)
gostplot(gost.results.top$Sr, interactive = T, capped = T)
gostplot(gost.results.top$Sp, interactive = T, capped = T)
gostplot(gost.results.top$Sv, interactive = T, capped = T)
gostplot(gost.results.top$Ce, interactive = T, capped = T)
dplyr::filter(gost.results.top$Ss$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name, intersection)
dplyr::filter(gost.results.top$Ce$result, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.top,
!term_id %in% gost.results.top$Ce$result$term_id)
dplyr::filter(gost.results.top$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
# Carry out GO enrichment using gProfiler2 ----
gost.results.bot <- run_GO(dat.Bot)
enriched.terms.bot <- find_enrichments(gost.results.bot)
highTerms.overlap.bot <- enriched.terms.bot %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
gostplot(gost.results.bot$Ss, interactive = T, capped = T)
gostplot(gost.results.bot$Sr, interactive = T, capped = T)
gostplot(gost.results.bot$Sp, interactive = T, capped = T)
gostplot(gost.results.bot$Sv, interactive = T, capped = T)
gostplot(gost.results.bot$Ce, interactive = T, capped = T)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
dplyr::filter(gost.results.bot$Ce$result, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.bot,
!term_id %in% gost.results.bot$Ce$result$term_id)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
dat.ATrich <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Top")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = desc(GC), prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
common.sequences <- inner_join(dat.ATrich, dat.Top) %>%
tally()
tally(dat.Top) %>%
dplyr::filter(species != "Ce") %>%
dplyr::mutate(ratio_shared_seq = common.sequences$n/n)
# Go analysis
gost.results.ATrich <- run_GO(dat.ATrich)
enriched.terms.ATrich <- find_enrichments(gost.results.ATrich)
highTerms.overlap.ATrich <- enriched.terms.ATrich %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
dplyr::select(term_id, term_name, intersection)
#
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
dat.GCrich <- species.specific.dat.df %>%
#dplyr::filter(species != "Ce") %>%
dplyr::mutate(Description = factor("Bot")) %>%
dplyr::group_by(Description, species) %>%
dplyr::slice_max(order_by = GC, prop = percentile) %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
common.sequences <- inner_join(dat.GCrich, dat.Bot) %>%
dplyr::filter(species != "Ce") %>%
tally()
tally(dat.Bot) %>%
dplyr::filter(species != "Ce") %>%
dplyr::mutate(ratio_shared_seq = common.sequences$n/n)
# Go analysis
gost.results.GCrich <- run_GO(dat.GCrich)
enriched.terms.GCrich <- find_enrichments(gost.results.GCrich)
highTerms.overlap.GCrich <- enriched.terms.GCrich %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 4)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
dplyr::select(term_id, term_name, intersection)
#Overlap btwn GO terms enriched in GC-rich sequences and least Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
#Overlap btwn GO terms enriched in GC-rich sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
gost.results.highexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = expression, prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.highexp) <- species_names
enriched.terms.highexp <- find_enrichments(gost.results.highexp)
highTerms.overlap.highexp <- enriched.terms.highexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% highTerms.overlap.highexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
#Overlap btwn GO terms enriched in highly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
#Overlap btwn GO terms enriched in highly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
gost.results.lowexp <- lapply(species_names, function(x) {
GO.geneID <- dat.expression %>%
ungroup() %>%
dplyr::filter(species == x) %>%
dplyr::filter(stage == "FLF") %>%
dplyr::slice_max(order_by = desc(expression), prop = percentile)
organism = case_when (x == "Ss" ~ "ststerprjeb528",
x == "Sr" ~ "strattprjeb125",
x == "Sp" ~ "stpapiprjeb525",
x == "Sv" ~ "stveneprjeb530",
x == "Ce" ~ "caelegprjna13758")
gost.res <- gost(GO.geneID$geneID,
organism = organism,
evcodes = TRUE,
correction_method = "fdr")
})
names(gost.results.lowexp) <- species_names
enriched.terms.lowexp <- find_enrichments(gost.results.lowexp)
highTerms.overlap.lowexp <- enriched.terms.lowexp %>%
bind_rows(.id = "species") %>%
dplyr::filter(species != "Ce") %>%
find_common_enrichments(.,0.001, 3)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% highTerms.overlap.lowexp$term_id) %>%
dplyr::select(term_id, term_name, intersection)
#Overlap btwn GO terms enriched in poorly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
#Overlap btwn GO terms enriched in poorly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot,
term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
dplyr::select(term_id, term_name)
# Save distribution plot of codon adaptivness by species
ggsave('../Outputs/Codon_Adaptiveness_Distributions_by_Species.pdf',
plot = species_adaptiveness_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save distribution plot of GC content by species
ggsave('../Outputs/GC_Distributions_by_Species.pdf',
plot = species_GC_plot,
width = 5.5, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
# Save plot of CAI vs FLF expression level
ggsave('../Outputs/CAIvsFLFlog2CPM_by_Species.pdf',
plot = caiExp.highExp_plot,
width = 5.5, height = 3,
units = "in", device = "pdf",
useDingbats = FALSE)
write_excel <- function(data, sheet_facet, filename){
file <- paste(filename,".xlsx",sep = "")
# Workbook
to_download <<- createWorkbook()
sapply(seq_along(sheet_facet), function(x){
addWorksheet(wb = to_download, sheetName = sheet_facet[[x]])
sheet_data <- data %>%
dplyr::filter(species == sheet_facet[[x]]) %>%
dplyr::select(!species)
# # Write Data
# ## Sheet header
# writeData(
# to_download,
# sheet = 1,
# x = c(
# paste0("Strongyloides Codon Usage Report"),
# paste0("Report generated on ", format(Sys.Date(), "%B %d, %Y"))
# )
# )
#
## Results of codon usage analysis
writeData(
to_download,
sheet = x,
x = sheet_data,
startRow = 1,
startCol = 1,
headerStyle = createStyle(
textDecoration = "Bold",
halign = "center",
border = "bottom"
)
)
})
saveWorkbook(to_download, file, overwrite=TRUE)
}
# Save files with top and bottom percentiles of codon adapted genes per species
tbl %>%
dplyr::select(geneID, species_CAI, GC, Description, species) %>%
write_excel(levels(tbl$species), '../Outputs/Percentile_CAI_Genes')
# Save list of all GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes
enriched.GO.terms <- rbind(
lowCAI = lapply(names(gost.results.bot), function(y){
gost.results.bot[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Least-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highCAI = lapply(names(gost.results.top), function(y){
gost.results.top[[y]]$result %>%
as_tibble() %>%
dplyr::select(-c(query, significant,parents)) %>%
dplyr::arrange(p_value) %>%
dplyr::mutate(species = y)
}) %>% bind_rows %>%
dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
dplyr::mutate(Description = "Most-adapted genes") %>%
dplyr::relocate(Description,species,term_id, source, term_name, .before = everything())
)
write_excel(enriched.GO.terms, levels(enriched.GO.terms$species), '../Outputs/Percentile_GO_Analysis')
# Save list of common GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes and GT-rich and AT-rich subsets too
common.terms <- rbind (lowCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.bot$term_id) %>%
dplyr::filter(species == "Ss") %>%
dplyr::select(term_id, source,term_name, Description),
highCAI = filter(enriched.GO.terms, term_id %in% highTerms.overlap.top$term_id) %>%
dplyr::filter(species == "Ss") %>%
dplyr::select(term_id, source, term_name, Description),
ATrich = dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
dplyr::select(term_id, source, term_name) %>%
dplyr::mutate(Description = "AT-rich genes"),
GCrich = dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
dplyr::select(term_id, source, term_name)%>%
dplyr::mutate(Description = "GC-rich genes")
) %>%
dplyr::mutate(Description = factor(Description)) %>%
dplyr::rename(species = Description)
write_excel(common.terms, levels(common.terms$species), '../Outputs/Commonly_Enriched_GO_Terms')
# Save Functional Enrichment Plots for bottom percentile
lapply(names(gost.results.bot), function(y){
gost.res <- gost.results.bot[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Bottom", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.bot$term_id)
ggsave(paste0('../Outputs/',y,'_Bot_CAI_Plot.pdf'), plot = gost.ggplot.pub,
width = 7, height = 4,
units = "in", device = "pdf",
useDingbats = FALSE)
})
# Save Functional Enrichment Plots for top percentile
lapply(names(gost.results.top), function(y){
gost.res <- gost.results.top[[y]]
gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
labs(title = paste(y, "Top", percentile*100,"% Codon Adapted Genes")) +
theme(plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(face = "bold",
size = 8, hjust = 0))
gost.ggplot.pub<- publish_gostplot(gost.ggplot,
highlight_terms = highTerms.overlap.top$term_id)
ggsave(paste0('../Outputs/',y,'_Top_CAI_Plot.pdf'), plot = gost.ggplot.pub,
width = 7, height = 5,
units = "in", device = "pdf",
useDingbats = FALSE)
})